<!DOCTYPE html>
<html lang="en-us">
  <head>

    <!-- Exactly one character-encoding declaration: the HTML standard forbids
         combining the http-equiv="content-type" form with a meta charset element. -->
    <meta charset="utf-8">
<title>Create data frame analytics jobs API | Elasticsearch Guide [7.7] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch Guide [7.7]">
<link rel="up" href="ml-df-analytics-apis.html" title="Machine learning data frame analytics APIs">
<link rel="prev" href="ml-df-analytics-apis.html" title="Machine learning data frame analytics APIs">
<link rel="next" href="put-inference.html" title="Create inference trained model API">
<meta name="DC.type" content="Learn/Docs/Elasticsearch/Reference/7.7">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="7.7">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- The data layer is created before the container loads; the noscript
         iframe is the fallback rendered when scripting is disabled. -->
    <script>dataLayer = [];</script><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <!-- Container loader: pushes a gtm.start event, then inserts an async gtm.js
         script element before the first script element in the document.
         URLs are explicit https: so the tag does not depend on the scheme the
         page itself was loaded with (protocol-relative URLs fail under file:). -->
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the dataLayer queue if one is already defined; otherwise start an empty one.
      window.dataLayer = window.dataLayer || [];
      // Queue a call by pushing the raw `arguments` object (not a copied array) onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry carrying the current Date, then a 'config' entry for the
      // same ID that appears in the loader URL above.
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Cookie-backed gate that decides whether to inject the feedback script.
      // Constructor arguments, as used by the methods below:
      //   e = rate compared against 100 (100 short-circuits to "always load"),
      //   h = mode string ("v" or "r"), f = name of the state cookie,
      //   g = URL of the script to inject.
      (function(){var g=function(e,h,f,g){
      // get(a): return the value of cookie `a`, or null when no such cookie exists.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a,c): write cookie `a` with value `c` and path "/", expiring 6048E5 ms (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): return true when the script should load. State lives in cookie `f` as "mode:rate:counter".
      //   - No cookie and e is 100: return true without writing a cookie.
      //   - No cookie otherwise: in mode "v" e is first replaced by 0 or 100 via Math.random();
      //     then [h, e, 0] is saved to the cookie.
      //   - Stored rate of 100: true. Mode "v": false.
      //   - Mode "r": true when counter % floor(100 / rate) is 0; the counter is then
      //     incremented and written back to the cookie.
      //   - Any other mode: true.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, create a script element with src `g` and append it to document.body (if present).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): run go() on the window load event, via addEventListener or, failing that, attachEvent.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with rate 100 and mode "r"; any exception thrown here is caught and ignored.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<strong>IMPORTANT</strong>: No additional bug fixes or documentation updates
will be released for this version. For the latest information, see the
<a href="../current/index.html">current release documentation</a>.
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch Guide [7.7]</a></span>
»
<span class="breadcrumb-link"><a href="rest-apis.html">REST APIs</a></span>
»
<span class="breadcrumb-link"><a href="ml-df-analytics-apis.html">Machine learning data frame analytics APIs</a></span>
»
<span class="breadcrumb-node">Create data frame analytics jobs API</span>
</div>
<div class="navheader">
<span class="prev">
<a href="ml-df-analytics-apis.html">« Machine learning data frame analytics APIs</a>
</span>
<span class="next">
<a href="put-inference.html">Create inference trained model API »</a>
</span>
</div>
<div class="section xpack">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="put-dfanalytics"></a>Create data frame analytics jobs API<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a><a class="xpack_tag" href="/subscriptions"></a>
</h2>
</div></div></div>

<p>Instantiates a data frame analytics job.</p>
<div class="warning admon">
<div class="icon"></div>
<div class="admon_content">
<p>This functionality is experimental and may be changed or removed completely in a future release. Elastic will take a best effort approach to fix any issues, but experimental features are not subject to the support SLA of official GA features.</p>
</div>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-request"></a>Request<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<p><code class="literal">PUT _ml/data_frame/analytics/&lt;data_frame_analytics_id&gt;</code></p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-prereq"></a>Prerequisites<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<p>If the Elasticsearch security features are enabled, you must have the following built-in roles and privileges:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
<code class="literal">machine_learning_admin</code>
</li>
<li class="listitem">
<code class="literal">kibana_admin</code> (UI only)
</li>
<li class="listitem">
source indices: <code class="literal">read</code>, <code class="literal">view_index_metadata</code>
</li>
<li class="listitem">
destination index: <code class="literal">read</code>, <code class="literal">create_index</code>, <code class="literal">manage</code> and <code class="literal">index</code>
</li>
<li class="listitem">
cluster: <code class="literal">monitor</code> (UI only)
</li>
</ul>
</div>
<p>For more information, see <a class="xref" href="security-privileges.html" title="Security privileges">Security privileges</a> and <a class="xref" href="built-in-roles.html" title="Built-in roles">Built-in roles</a>.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-desc"></a>Description<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<p>This API creates a data frame analytics job that performs an analysis on the source
indices and stores the outcome in a destination index.</p>
<p>If the destination index does not exist, it is created automatically when you
start the job. See <a class="xref" href="start-dfanalytics.html" title="Start data frame analytics jobs API">Start data frame analytics jobs</a>.</p>
<p>If you supply only a subset of the regression or classification parameters,
<a href="/guide/en/machine-learning/7.7/hyperparameters.html" class="ulink" target="_top">hyperparameter optimization</a> occurs.
It determines a value for each of the undefined parameters.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-path-params"></a>Path parameters<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">&lt;data_frame_analytics_id&gt;</code>
</span>
</dt>
<dd>
(Required, string)
Identifier for the data frame analytics job. This identifier can contain lowercase
alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
and end with alphanumeric characters.
</dd>
</dl>
</div>
</div>

<div class="section child_attributes">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-request-body"></a>Request body<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">allow_lazy_start</code>
</span>
</dt>
<dd>
(Optional, boolean)
Specifies whether this job can start when there is insufficient machine learning node
capacity for it to be immediately assigned to a node. The default is <code class="literal">false</code>; if
a machine learning node with capacity to run the job cannot immediately be found, the
<a class="xref" href="start-dfanalytics.html" title="Start data frame analytics jobs API">Start data frame analytics jobs</a> API returns an error. However, this is also subject to the
cluster-wide <code class="literal">xpack.ml.max_lazy_ml_nodes</code> setting. See <a class="xref" href="ml-settings.html#advanced-ml-settings" title="Advanced machine learning settings">Advanced machine learning settings</a>.
If this option is set to <code class="literal">true</code>, the API does not return an error and the job
waits in the <code class="literal">starting</code> state until sufficient machine learning node capacity is available.
</dd>
</dl>
</div>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">analysis</code>
</span>
</dt>
<dd>
<p>
(Required, object)
The analysis configuration, which contains the information necessary to perform
one of the following types of analysis: classification, outlier detection, or
regression.
</p>
<details open>
<summary class="title">Properties of <code class="literal">analysis</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">classification</code>
</span>
</dt>
<dd>
<p>
(Required<sup>*</sup>, object)
The configuration information necessary to perform
<a href="/guide/en/machine-learning/7.7/dfa-classification.html" class="ulink" target="_top">classification</a>.
</p>
<div class="tip admon">
<div class="icon"></div>
<div class="admon_content">
<p>Advanced parameters are for fine-tuning classification analysis. They are set
automatically by hyperparameter optimization to give the minimum validation
error. It is highly recommended to use the default values unless you fully
understand the function of these parameters.</p>
</div>
</div>
<details open>
<summary class="title">Properties of <code class="literal">classification</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">class_assignment_objective</code>
</span>
</dt>
<dd>
(Optional, string)
Defines the objective to optimize when assigning class labels:
<code class="literal">maximize_accuracy</code> or <code class="literal">maximize_minimum_recall</code>. When maximizing accuracy,
class labels are chosen to maximize the number of correct predictions. When
maximizing minimum recall, labels are chosen to maximize the minimum recall
for any class. Defaults to <code class="literal">maximize_minimum_recall</code>.
</dd>
<dt>
<span class="term">
<code class="literal">dependent_variable</code>
</span>
</dt>
<dd>
<p>
(Required, string)
</p>
<p>Defines which field of the document is to be predicted.
This parameter is supplied by field name and must match one of the fields in
the index being used to train. If this field is missing from a document, then
that document will not be used for training, but a prediction with the trained
model will be generated for it. It is also known as the target variable.</p>
<p>The data type of the field must be numeric (<code class="literal">integer</code>, <code class="literal">short</code>, <code class="literal">long</code>, <code class="literal">byte</code>),
categorical (<code class="literal">ip</code> or <code class="literal">keyword</code>), or boolean. There must be no more than 30
different values in this field.</p>
</dd>
<dt>
<span class="term">
<code class="literal">eta</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. The shrinkage applied to the weights. Smaller
values result in larger forests which have a better generalization error.
However, the smaller the value the longer the training will take. For more
information about shrinkage, see
<a href="https://en.wikipedia.org/wiki/Gradient_boosting#Shrinkage" class="ulink" target="_top">this wiki article</a>.
</dd>
<dt>
<span class="term">
<code class="literal">feature_bag_fraction</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Defines the fraction of features that will be
used when selecting a random bag for each candidate split. By default, this
value is calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">gamma</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies a linear penalty associated with the size of
individual trees in the forest. The higher the value the more training will
prefer smaller trees. The smaller this parameter the larger individual trees
will be and the longer training will take. By default, this value is calculated
during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">lambda</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies an L2 regularization term which applies to
leaf weights of the individual trees in the forest. The higher the value the
more training will attempt to keep leaf weights small. This makes the prediction
function smoother at the expense of potentially not being able to capture
relevant relationships between the features and the dependent variable. The smaller this
parameter the larger individual trees will be and the longer training will take.
By default, this value is calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">max_trees</code>
</span>
</dt>
<dd>
(Optional, integer)
Advanced configuration option. Defines the maximum number of trees the forest is
allowed to contain. The maximum value is 2000. By default, this value is
calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">num_top_classes</code>
</span>
</dt>
<dd>
(Optional, integer)
Defines the number of categories for which the predicted probabilities are
reported. It must be non-negative. If it is greater than the total number of
categories, the API reports all category probabilities. Defaults to 2.
</dd>
<dt>
<span class="term">
<code class="literal">num_top_feature_importance_values</code>
</span>
</dt>
<dd>
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
<a href="/guide/en/machine-learning/7.7/ml-feature-importance.html" class="ulink" target="_top">feature importance</a> values per document to return.
By default, it is zero and no feature importance calculation occurs.
</dd>
<dt>
<span class="term">
<code class="literal">prediction_field_name</code>
</span>
</dt>
<dd>
(Optional, string)
Defines the name of the prediction field in the results.
Defaults to <code class="literal">&lt;dependent_variable&gt;_prediction</code>.
</dd>
<dt>
<span class="term">
<code class="literal">randomize_seed</code>
</span>
</dt>
<dd>
(Optional, long)
Defines the seed to the random generator that is used to pick
which documents will be used for training. By default it is randomly generated.
Set it to a specific value to ensure the same documents are used for training
assuming other related parameters (e.g. <code class="literal">source</code>, <code class="literal">analyzed_fields</code>, etc.) are
the same.
</dd>
<dt>
<span class="term">
<code class="literal">training_percent</code>
</span>
</dt>
<dd>
(Optional, integer)
Defines the percentage of the eligible documents that will
be used for training. Documents that are ignored by the analysis (for example
those that contain arrays with more than one value) won’t be included in the
calculation for used percentage. Defaults to <code class="literal">100</code>.
</dd>
</dl>
</div>
</div>
</details>
</dd>
<dt>
<span class="term">
<code class="literal">outlier_detection</code>
</span>
</dt>
<dd>
<p>
(Required<sup>*</sup>, object)
The configuration information necessary to perform
<a href="/guide/en/machine-learning/7.7/dfa-outlier-detection.html" class="ulink" target="_top">outlier detection</a>:
</p>
<details open>
<summary class="title">Properties of <code class="literal">outlier_detection</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">compute_feature_influence</code>
</span>
</dt>
<dd>
(Optional, boolean)
Specifies whether the feature influence calculation is enabled. Defaults to
<code class="literal">true</code>.
</dd>
<dt>
<span class="term">
<code class="literal">feature_influence_threshold</code>
</span>
</dt>
<dd>
(Optional, double)
The minimum outlier score that a document needs to have in order to calculate its
feature influence score. Value range: 0-1 (<code class="literal">0.1</code> by default).
</dd>
<dt>
<span class="term">
<code class="literal">method</code>
</span>
</dt>
<dd>
(Optional, string)
The method that outlier detection uses. Available methods are <code class="literal">lof</code>, <code class="literal">ldof</code>,
<code class="literal">distance_kth_nn</code>, <code class="literal">distance_knn</code>, and <code class="literal">ensemble</code>. The default value is
<code class="literal">ensemble</code>, which means that outlier detection uses an ensemble of different methods
and normalizes and combines their individual outlier scores to obtain the overall
outlier score.
</dd>
<dt>
<span class="term">
<code class="literal">n_neighbors</code>
</span>
</dt>
<dd>
(Optional, integer)
Defines the value for how many nearest neighbors each method of outlier detection
uses to calculate its outlier score. When the value is not set, different values are
used for different ensemble members. This default behavior helps improve the
diversity in the ensemble; only override it if you are confident that the value
you choose is appropriate for the data set.
</dd>
<dt>
<span class="term">
<code class="literal">outlier_fraction</code>
</span>
</dt>
<dd>
(Optional, double)
The proportion of the data set that is assumed to be outlying prior to
outlier detection. For example, 0.05 means it is assumed that 5% of values are real
outliers and 95% are inliers.
</dd>
<dt>
<span class="term">
<code class="literal">standardization_enabled</code>
</span>
</dt>
<dd>
(Optional, boolean)
If <code class="literal">true</code>, the following operation is performed on the columns before computing
outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to <code class="literal">true</code>. For more
information about this concept, see
<a href="https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization)" class="ulink" target="_top">Wikipedia</a>.
</dd>
</dl>
</div>
</div>
</details>
</dd>
<dt>
<span class="term">
<code class="literal">regression</code>
</span>
</dt>
<dd>
<p>
(Required<sup>*</sup>, object)
The configuration information necessary to perform
<a href="/guide/en/machine-learning/7.7/dfa-regression.html" class="ulink" target="_top">regression</a>.
</p>
<div class="tip admon">
<div class="icon"></div>
<div class="admon_content">
<p>Advanced parameters are for fine-tuning regression analysis. They are set
automatically by hyperparameter optimization to give the minimum validation
error. It is highly recommended to use the default values unless you fully
understand the function of these parameters.</p>
</div>
</div>
<details open>
<summary class="title">Properties of <code class="literal">regression</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">dependent_variable</code>
</span>
</dt>
<dd>
<p>
(Required, string)
</p>
<p>Defines which field of the document is to be predicted.
This parameter is supplied by field name and must match one of the fields in
the index being used to train. If this field is missing from a document, then
that document will not be used for training, but a prediction with the trained
model will be generated for it. It is also known as the continuous target variable.</p>
<p>The data type of the field must be numeric.</p>
</dd>
<dt>
<span class="term">
<code class="literal">eta</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. The shrinkage applied to the weights. Smaller
values result in larger forests which have a better generalization error.
However, the smaller the value the longer the training will take. For more
information about shrinkage, see
<a href="https://en.wikipedia.org/wiki/Gradient_boosting#Shrinkage" class="ulink" target="_top">this wiki article</a>.
</dd>
<dt>
<span class="term">
<code class="literal">feature_bag_fraction</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Defines the fraction of features that will be
used when selecting a random bag for each candidate split. By default, this
value is calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">gamma</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies a linear penalty associated with the size of
individual trees in the forest. The higher the value the more training will
prefer smaller trees. The smaller this parameter the larger individual trees
will be and the longer training will take. By default, this value is calculated
during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">lambda</code>
</span>
</dt>
<dd>
(Optional, double)
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies an L2 regularization term which applies to
leaf weights of the individual trees in the forest. The higher the value the
more training will attempt to keep leaf weights small. This makes the prediction
function smoother at the expense of potentially not being able to capture
relevant relationships between the features and the dependent variable. The smaller this
parameter the larger individual trees will be and the longer training will take.
By default, this value is calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">max_trees</code>
</span>
</dt>
<dd>
(Optional, integer)
Advanced configuration option. Defines the maximum number of trees the forest is
allowed to contain. The maximum value is 2000. By default, this value is
calculated during hyperparameter optimization.
</dd>
<dt>
<span class="term">
<code class="literal">num_top_feature_importance_values</code>
</span>
</dt>
<dd>
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
<a href="/guide/en/machine-learning/7.7/ml-feature-importance.html" class="ulink" target="_top">feature importance</a> values per document to return.
By default, it is zero and no feature importance calculation occurs.
</dd>
<dt>
<span class="term">
<code class="literal">prediction_field_name</code>
</span>
</dt>
<dd>
(Optional, string)
Defines the name of the prediction field in the results.
Defaults to <code class="literal">&lt;dependent_variable&gt;_prediction</code>.
</dd>
<dt>
<span class="term">
<code class="literal">randomize_seed</code>
</span>
</dt>
<dd>
(Optional, long)
Defines the seed to the random generator that is used to pick
which documents will be used for training. By default it is randomly generated.
Set it to a specific value to ensure the same documents are used for training
assuming other related parameters (e.g. <code class="literal">source</code>, <code class="literal">analyzed_fields</code>, etc.) are
the same.
</dd>
<dt>
<span class="term">
<code class="literal">training_percent</code>
</span>
</dt>
<dd>
(Optional, integer)
Defines what percentage of the eligible documents will
be used for training. Documents that are ignored by the analysis (for example
those that contain arrays with more than one value) won’t be included in the
calculation for used percentage. Defaults to <code class="literal">100</code>.
</dd>
</dl>
</div>
</div>
</details>
</dd>
</dl>
</div>
</div>
</details>
</dd>
</dl>
</div>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">analyzed_fields</code>
</span>
</dt>
<dd>
<p>
(Optional, object)
Specify <code class="literal">includes</code> and/or <code class="literal">excludes</code> patterns to select which fields will be
included in the analysis. The patterns specified in <code class="literal">excludes</code> are applied last,
therefore <code class="literal">excludes</code> takes precedence. In other words, if the same field is
specified in both <code class="literal">includes</code> and <code class="literal">excludes</code>, then the field will not be included
in the analysis.
</p>
<p><a id="dfa-supported-fields"></a>The supported fields for each type of analysis are as follows:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
Outlier detection requires numeric or boolean data to analyze. The algorithms
don’t support missing values therefore fields that have data types other than
numeric or boolean are ignored. Documents where included fields contain missing
values, null values, or an array are also ignored. Therefore the <code class="literal">dest</code> index
may contain documents that don’t have an outlier score.
</li>
<li class="listitem">
Regression supports fields that are numeric, <code class="literal">boolean</code>, <code class="literal">text</code>,
<code class="literal">keyword</code>, and <code class="literal">ip</code>. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also
ignored. Documents in the <code class="literal">dest</code> index that don’t contain a results field are
not included in the regression analysis.
</li>
<li class="listitem">
Classification supports fields that are numeric, <code class="literal">boolean</code>, <code class="literal">text</code>,
<code class="literal">keyword</code>, and <code class="literal">ip</code>. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also ignored.
Documents in the <code class="literal">dest</code> index that don’t contain a results field are not
included in the classification analysis. Classification analysis can be improved by mapping
ordinal variable values to a single number. For example, in the case of age ranges,
you can model the values as "0-14" = 0, "15-24" = 1, "25-34" = 2, and so on.
</li>
</ul>
</div>
<p>If <code class="literal">analyzed_fields</code> is not set, only the relevant fields will be included. For
example, all the numeric fields for outlier detection. For more information about
field selection, see <a class="xref" href="explain-dfanalytics.html" title="Explain data frame analytics API">Explain data frame analytics API</a>.</p>
<details open>
<summary class="title">Properties of <code class="literal">analyzed_fields</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">excludes</code>
</span>
</dt>
<dd>
(Optional, array)
An array of strings that defines the fields that will be excluded from the
analysis. You do not need to add fields with unsupported data types to
<code class="literal">excludes</code>; these fields are excluded from the analysis automatically.
</dd>
<dt>
<span class="term">
<code class="literal">includes</code>
</span>
</dt>
<dd>
(Optional, array)
An array of strings that defines the fields that will be included in the
analysis.
</dd>
</dl>
</div>
</div>
</details>
</dd>
<dt>
<span class="term">
<code class="literal">description</code>
</span>
</dt>
<dd>
(Optional, string)
A description of the job.
</dd>
<dt>
<span class="term">
<code class="literal">dest</code>
</span>
</dt>
<dd>
<p>
(Required, object)
The destination configuration, consisting of <code class="literal">index</code> and optionally
<code class="literal">results_field</code> (<code class="literal">ml</code> by default).
</p>
<details open>
<summary class="title">Properties of <code class="literal">dest</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">index</code>
</span>
</dt>
<dd>
(Required, string) Defines the <em>destination index</em> to store the results of the
data frame analytics job.
</dd>
<dt>
<span class="term">
<code class="literal">results_field</code>
</span>
</dt>
<dd>
(Optional, string) Defines the name of the field in which to store the results
of the analysis. Defaults to <code class="literal">ml</code>.
</dd>
</dl>
</div>
</div>
</details>
</dd>
<dt>
<span class="term">
<code class="literal">model_memory_limit</code>
</span>
</dt>
<dd>
(Optional, string)
The approximate maximum amount of memory resources that are permitted for
analytical processing. The default value for data frame analytics jobs is <code class="literal">1gb</code>. If
your <code class="literal">elasticsearch.yml</code> file contains an <code class="literal">xpack.ml.max_model_memory_limit</code>
setting, an error occurs when you try to create data frame analytics jobs that have
<code class="literal">model_memory_limit</code> values greater than that setting. For more information, see
<a class="xref" href="ml-settings.html" title="Machine learning settings in Elasticsearch">Machine learning settings</a>.
</dd>
<dt>
<span class="term">
<code class="literal">source</code>
</span>
</dt>
<dd>
<p>
(Required, object)
The configuration of how to source the analysis data. It requires an <code class="literal">index</code>.
Optionally, <code class="literal">query</code> and <code class="literal">_source</code> may be specified.
</p>
<details open>
<summary class="title">Properties of <code class="literal">source</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">index</code>
</span>
</dt>
<dd>
<p>
(Required, string or array) Index or indices on which to perform the analysis.
It can be a single index or index pattern as well as an array of indices or
patterns.
</p>
<div class="warning admon">
<div class="icon"></div>
<div class="admon_content">
<p>If your source indices contain documents with the same IDs, only the
document that is indexed last appears in the destination index.</p>
</div>
</div>
</dd>
<dt>
<span class="term">
<code class="literal">query</code>
</span>
</dt>
<dd>
(Optional, object) The Elasticsearch query domain-specific language (<a class="xref" href="query-dsl.html" title="Query DSL">DSL</a>).
This value corresponds to the query object in an Elasticsearch search POST body. All the
options that are supported by Elasticsearch can be used, as this object is passed
verbatim to Elasticsearch. By default, this property has the following value:
<code class="literal">{"match_all": {}}</code>.
</dd>
<dt>
<span class="term">
<code class="literal">_source</code>
</span>
</dt>
<dd>
<p>
(Optional, object) Specify <code class="literal">includes</code> and/or <code class="literal">excludes</code> patterns to select which
fields will be present in the destination. Fields that are excluded cannot be
included in the analysis.
</p>
<details open>
<summary class="title">Properties of <code class="literal">_source</code></summary>
<div class="content">
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">includes</code>
</span>
</dt>
<dd>
(Optional, array) An array of strings that defines the fields that will be included in the
destination.
</dd>
<dt>
<span class="term">
<code class="literal">excludes</code>
</span>
</dt>
<dd>
(Optional, array) An array of strings that defines the fields that will be excluded from
the destination.
</dd>
</dl>
</div>
</div>
</details>
</dd>
</dl>
</div>
</div>
</details>
</dd>
</dl>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="ml-put-dfanalytics-example"></a>Examples<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h3>
</div></div></div>
<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="ml-put-dfanalytics-example-preprocess"></a>Preprocessing actions example<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h4>
</div></div></div>
<p>The following example shows how to limit the scope of the analysis to certain
fields, specify excluded fields in the destination index, and use a query to
filter your data before analysis.</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT _ml/data_frame/analytics/model-flight-delays-pre
{
  "source": {
    "index": [
      "kibana_sample_data_flights" <a id="CO594-1"></a><i class="conum" data-value="1"></i>
    ],
    "query": { <a id="CO594-2"></a><i class="conum" data-value="2"></i>
      "range": {
        "DistanceKilometers": {
          "gt": 0
        }
      }
    },
    "_source": { <a id="CO594-3"></a><i class="conum" data-value="3"></i>
      "includes": [],
      "excludes": [
        "FlightDelay",
        "FlightDelayType"
      ]
    }
  },
  "dest": { <a id="CO594-4"></a><i class="conum" data-value="4"></i>
    "index": "df-flight-delays",
    "results_field": "ml-results"
  },
  "analysis": {
    "regression": {
      "dependent_variable": "FlightDelayMin",
      "training_percent": 90
    }
  },
  "analyzed_fields": { <a id="CO594-5"></a><i class="conum" data-value="5"></i>
    "includes": [],
    "excludes": [
      "FlightNum"
    ]
  },
  "model_memory_limit": "100mb"
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/1844.console"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO594-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>Source index to analyze.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO594-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>This query filters out entire documents that will not be present in the
destination index.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO594-3"><i class="conum" data-value="3"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">_source</code> object defines fields in the dataset that will be included in or
excluded from the destination index.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO594-4"><i class="conum" data-value="4"></i></a></p>
</td>
<td align="left" valign="top">
<p>Defines the destination index that contains the results of the analysis and
the fields of the source index specified in the <code class="literal">_source</code> object. Also defines
the name of the <code class="literal">results_field</code>.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO594-5"><i class="conum" data-value="5"></i></a></p>
</td>
<td align="left" valign="top">
<p>Specifies fields to be included in or excluded from the analysis. This does
not affect whether the fields will be present in the destination index, only
affects whether they are used in the analysis.</p>
</td>
</tr>
</table>
</div>
<p>In this example, we can see that all the fields of the source index are included
in the destination index except <code class="literal">FlightDelay</code> and <code class="literal">FlightDelayType</code> because
these are defined as excluded fields by the <code class="literal">excludes</code> parameter of the
<code class="literal">_source</code> object. The <code class="literal">FlightNum</code> field is included in the destination index,
however it is not included in the analysis because it is explicitly specified as
an excluded field by the <code class="literal">excludes</code> parameter of the <code class="literal">analyzed_fields</code> object.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="ml-put-dfanalytics-example-od"></a>Outlier detection example<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h4>
</div></div></div>
<p>The following example creates the <code class="literal">loganalytics</code> data frame analytics job; the analysis
type is <code class="literal">outlier_detection</code>:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT _ml/data_frame/analytics/loganalytics
{
  "description": "Outlier detection on log data",
  "source": {
    "index": "logdata"
  },
  "dest": {
    "index": "logdata_out"
  },
  "analysis": {
    "outlier_detection": {
      "compute_feature_influence": true,
      "outlier_fraction": 0.05,
      "standardization_enabled": true
    }
  }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/1845.console"></div>
<p>The API returns the following result:</p>
<div class="pre_wrapper lang-console-result">
<pre class="programlisting prettyprint lang-console-result">{
    "id": "loganalytics",
    "description": "Outlier detection on log data",
    "source": {
        "index": ["logdata"],
        "query": {
            "match_all": {}
        }
    },
    "dest": {
        "index": "logdata_out",
        "results_field": "ml"
    },
    "analysis": {
        "outlier_detection": {
            "compute_feature_influence": true,
            "outlier_fraction": 0.05,
            "standardization_enabled": true
        }
    },
    "model_memory_limit": "1gb",
    "create_time" : 1562265491319,
    "version" : "7.6.0",
    "allow_lazy_start" : false
}</pre>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="ml-put-dfanalytics-example-r"></a>Regression examples<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h4>
</div></div></div>
<p>The following example creates the <code class="literal">house_price_regression_analysis</code>
data frame analytics job; the analysis type is <code class="literal">regression</code>:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT _ml/data_frame/analytics/house_price_regression_analysis
{
  "source": {
    "index": "houses_sold_last_10_yrs"
  },
  "dest": {
    "index": "house_price_predictions"
  },
  "analysis":
    {
      "regression": {
        "dependent_variable": "price"
      }
    }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/1846.console"></div>
<p>The API returns the following result:</p>
<div class="pre_wrapper lang-console-result">
<pre class="programlisting prettyprint lang-console-result">{
  "id" : "house_price_regression_analysis",
  "source" : {
    "index" : [
      "houses_sold_last_10_yrs"
    ],
    "query" : {
      "match_all" : { }
    }
  },
  "dest" : {
    "index" : "house_price_predictions",
    "results_field" : "ml"
  },
  "analysis" : {
    "regression" : {
      "dependent_variable" : "price",
      "training_percent" : 100
    }
  },
  "model_memory_limit" : "1gb",
  "create_time" : 1567168659127,
  "version" : "8.0.0",
  "allow_lazy_start" : false
}</pre>
</div>
<p>The following example creates a job and specifies a training percent:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT _ml/data_frame/analytics/student_performance_mathematics_0.3
{
 "source": {
   "index": "student_performance_mathematics"
 },
 "dest": {
   "index":"student_performance_mathematics_reg"
 },
 "analysis":
   {
     "regression": {
       "dependent_variable": "G3",
       "training_percent": 70,  <a id="CO595-1"></a><i class="conum" data-value="1"></i>
       "randomize_seed": 19673948271  <a id="CO595-2"></a><i class="conum" data-value="2"></i>
     }
   }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/1847.console"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO595-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>The percentage of the data set that is used for training the model.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO595-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>The seed that is used to randomly pick which data is used for training.</p>
</td>
</tr>
</table>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="ml-put-dfanalytics-example-c"></a>Classification example<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc">edit</a>
</h4>
</div></div></div>
<p>The following example creates the <code class="literal">loan_classification</code> data frame analytics job; the
analysis type is <code class="literal">classification</code>:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT _ml/data_frame/analytics/loan_classification
{
  "source" : {
    "index": "loan-applicants"
  },
  "dest" : {
    "index": "loan-applicants-classified"
  },
  "analysis" : {
    "classification": {
      "dependent_variable": "label",
      "training_percent": 75,
      "num_top_classes": 2
    }
  }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/1848.console"></div>
</div>

</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="ml-df-analytics-apis.html">« Machine learning data frame analytics APIs</a>
</span>
<span class="next">
<a href="put-inference.html">Create inference trained model API »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
